Load bar business ids, and load the review dataset into pandas


In [1]:
import pandas as pd
import pickle
import numpy as np

# Load the bar review dataset 
review = pd.read_pickle('../output/bar_restaurant_reviews_cleaned_and_tokenized.pickle')
review.head(5)


Out[1]:
business_id date review_id stars text type user_id votes_cool votes_funny votes_useful cleaned_tokenized
10 UsFtqoBl7naz8AVUBZMjQQ 2013-11-08 Di3exaUCFNw1V4kSNW5pgA 5 All the food is great here. But the best thing... review uK8tzraOp4M5u3uYrqIBXg 0 0 0 [[food, great], [best, thing, wing], [wing, si...
11 UsFtqoBl7naz8AVUBZMjQQ 2014-03-29 0Lua2-PbqEQMjD9r89-asw 3 We checked this place out this past Monday for... review I_47G-R2_egp7ME5u_ltew 0 0 0 [[checked, place, past, monday, wing-night], [...
12 UsFtqoBl7naz8AVUBZMjQQ 2014-10-29 7N9j5YbBHBW6qguE5DAeyA 2 Wing sauce is like water. Pretty much a lot of... review PP_xoMSYlGr2pb67BbqBdA 0 0 0 [[wing, sauce, like, water], [pretty, much, a-...
13 UsFtqoBl7naz8AVUBZMjQQ 2014-11-28 mjCJR33jvUNt41iJCxDU_g 4 Cold cheap beer. Good bar food. Good service. ... review JPPhyFE-UE453zA6K0TVgw 1 1 1 [[cold, cheap, beer], [good, bar, food], [good...
22 mVHrayjG3uZ_RLHkLj-AMg 2012-12-01 6w6gMZ3iBLGcUM4RBIuifQ 5 This place was DELICIOUS!! My parents saw a r... review LWbYpcangjBMm4KPxZGOKg 0 0 5 [[place, delicious], [parent, saw, recommendat...

In [63]:
# Now let's generate a word2vec trained model on the dataset.
# First we need to override the simple weighting scheme

In [76]:
import gensim
from itertools import chain
import sys
sys.path.append('../vectorsearch/')
import nltk_helper
import word2vec



def create_vector_model(model, tokenized_docs, **kwargs):
    """
    Create gensim Word2Vec model out of review list
    where each element contains review
    """
    review_flatten = list(chain.from_iterable(tokenized_docs))
    print 'training word2vec model...'
    vec_model = model(review_flatten, **kwargs)
    return vec_model



# Arguments to the word2vec model
model_args = {'size':200, 'window':5, 'min_count':5, 'workers':12, 'iter':10}


word2vec_model = create_vector_model(model=word2vec.Word2Vec, 
                                     tokenized_docs=review.cleaned_tokenized.iloc[:],
                                     **model_args)
# Done training, so this reduces ram footprint.
word2vec_model.init_sims(replace=True)


training word2vec model...

In [ ]:


In [77]:
word2vec_model.save('../output/word2vec_bars_and_restaurants.model')

In [40]:
import sys
sys.path.append('../vectorsearch/')
import nltk_helper
import word2vec

word2vec_model = word2vec.Word2Vec.load('../output/word2vec_bars_and_restaurants.model')

In [44]:
from query import parse_query
query_dict = parse_query('steak:1; ocean:2 ; land:1' )


for word, sim in word2vec_model.most_similar(query_dict, topn=20):
    print np.dot(word2vec_model[word], word2vec_model[word])


#print word2vec_model.most_similar({'pepperoni':10, 'cheese':1000000})


1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0

In [ ]:


In [23]:



Out[23]:
True

In [43]:
# model_args = {'num_topics':100}
# lda_model = create_vector_model(model=gensim.models.LdaModel, review_list=yelp_review_sample, **model_args)

#model.similarity('bar')
#model.most_similar('bar', topn=20)


breaking into sentence...
training word2vec model...

In [38]:
# from gensim import corpora, models, similarities
# model = models.ldamodel.LdaModel(yelp_review_sample, num_topics=10)

In [75]:



All the food is great here. But the best thing they have is their wings. Their wings are simply fantastic!!  The "Wet Cajun" are by the best & most popular.  I also like the seasoned salt wings.  Wing Night is Monday & Wednesday night, $0.75 whole wings!

The dining area is nice. Very family friendly! The bar is very nice is well.  This place is truly a Yinzer's dream!!  "Pittsburgh Dad" would love this place n'at!!

In [ ]:


In [109]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, #max_features=n_features,
                                   stop_words='english')


from time import time
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(yelp_review_sample)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))


print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))


done in 23.743s.
Extracting tf features for LDA...

NameErrorTraceback (most recent call last)
<ipython-input-109-ad2d84a2d83a> in <module>()
     14 # Use tf (raw term count) features for LDA.
     15 print("Extracting tf features for LDA...")
---> 16 tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
     17                                 stop_words='english')
     18 t0 = time()

NameError: name 'n_features' is not defined

In [ ]:
lda.fit(yelp_review_sample)

In [40]:
import calculator
reload(calculator)

calc = calculator.word2vec_calc(word2vec_model)
calc.calc('dog+dog')



AttributeErrorTraceback (most recent call last)
<ipython-input-40-aeaa6285dd18> in <module>()
      4 
      5 calc = calculator.word2vec_calc(word2vec_model)
----> 6 calc.calc('dog+dog')

/data/insight_yelp/vectorsearch/calculator.pyc in calc(self, expr)
     80         tokens = [Token(token_map.get(x, 'NUM'), x) for x in split_expr]
     81         tree = self.match('add', tokens)[0]
---> 82         tree = self.flatten_right_associativity( tree )
     83         return self.evaluate(tree)
     84 

/data/insight_yelp/vectorsearch/calculator.pyc in flatten_right_associativity(self, tree)
     69         return map(func, tree.matched) if tree.name in rule_map else tree[1]
     70     def flatten_right_associativity(self, tree):
---> 71         new = self._recurse_tree(tree, self.flatten_right_associativity)
     72         if tree.name in fix_assoc_rules and len(new)==3 and new[2].name==tree.name:
     73             new[-1:] = new[-1].matched

/data/insight_yelp/vectorsearch/calculator.pyc in _recurse_tree(self, tree, func)
     67 
     68     def _recurse_tree(self, tree, func):
---> 69         return map(func, tree.matched) if tree.name in rule_map else tree[1]
     70     def flatten_right_associativity(self, tree):
     71         new = self._recurse_tree(tree, self.flatten_right_associativity)

AttributeError: 'NoneType' object has no attribute 'name'

In [ ]: